## process/clean Bluechip data, prepare for estimating policy rules

library(dplyr)
library(lubridate)
library(zoo)
library(tidyr)
library(readr)

## load raw BCFF data
load("data/bcff_raw_May23.RData")

## summary stats
vars <- c("ff", "y2", "cpi", "rgdp")
varnames <- c("Federal funds rate", "Two-year yield", "CPI inflation", "Output growth")
cols <- c("Mean", "SD", "Min", "Max", "N")
tbl <- matrix(NA, length(vars), length(cols))
colnames(tbl) <- cols
rownames(tbl) <- varnames
for (j in 1:length(vars)) {
    x <- data[[vars[j]]]
    tbl[j, ] <- c(mean(x, na.rm=TRUE),
                  sd(x, na.rm=TRUE),
                  min(x, na.rm=TRUE),
                  max(x, na.rm=TRUE),
                  sum(!is.na(x)))
}
print(tbl)

N <- data %>% group_by(date, h) %>% summarize(n=n()) %>% pull(n)
cat("Range for number of forecasters:", range(N), "\n")
cat("Sample period:", format(range(data$date)), "\n")

bc <- data %>%
    mutate(id = factor(name)) %>%
    select(date, id, h,
           ff, tb3, y1, y2, y5, y10, mb30, baa,
           cpi, rgdp, gdppi) %>%
    arrange(date, id, h)

##################################################
## process macro forecasts
## utility functions
get_fmoq <- function(m) (m-1) %/% 3 * 3 + 1 # first month of quarter
get_fdate <- function(date, h) { # relevant forecast quarter (first day)
    fdate <- date + 3*months(h)
    make_date(year(fdate), get_fmoq(month(fdate)), 1)
}
get_lqdate <- function(date) { # last quarter (first day)
    lqdate <- date - 3*months(1)
    make_date(year(lqdate), get_fmoq(month(lqdate)), 1)
}

### calculate four-quarter CPI inflation forecast
## (1) get quarterly CPI inflation rates
cpi <- read_csv("data/CPIAUCSL.csv", col_names = c("date", "cpi"), skip = 1) %>%
    mutate(cpi = 100*((cpi / dplyr::lag(cpi, 3))^4 - 1)) %>% # annualized quarterly CPI inflation rate
    filter(month(date) %% 3 == 0,  ## only end of quarter
           year(date) > 1980) %>%
    mutate(date = make_date(year(date), month(date) - 2, 1)) %>% # use beginning of quarter date
    select(date, cpi)
## (2) add past realized CPI inflation
surveys <- bc %>%
    select(date) %>% distinct %>%
    expand_grid(h = -3:-1) %>%
    mutate(fdate = get_fdate(date, h)) %>%
    left_join(cpi %>% rename(fdate = date, cpirel = cpi)) %>%  # add released CPI inflation
    select(-fdate)
bc <- bc %>%
    group_by(date, id) %>%
    group_modify(~ add_row(.x, h=-3:-1)) %>%  # add rows for last three quarters
    ungroup %>%
    left_join(surveys)

## (3) calculate four-quarter inflation forecast, using realized values when h < 4
## geomean <- function(x) exp(mean(log(x))) # 100*(geomean(x/100+1)-1)   # geometric mean of gross rates is almost the same
bc <- bc %>%
    mutate(cpi1q = cpi,  # save original forecasts
           cpi = ifelse(h < 0, cpirel, cpi)) %>%
    group_by(date, id) %>%
    arrange(h) %>%
    mutate(cpi = na.approx(cpi, na.rm=FALSE),  # linearly interpolate inside missing values
           cpicum = rollapply(cpi, width=4, FUN=mean , align="right", fill = NA)) %>%
    ungroup
bc <- bc %>%
    filter(h >= 0) %>%
    mutate(cpi = cpicum) %>%
    select(-cpirel, -cpicum)

## cumulative real GDP growth
bc <- bc %>%
    group_by(date, id) %>%
    arrange(h) %>%
    mutate(rgdp = na.approx(rgdp, na.rm=FALSE), # linearly interpolate inside missing values
           rgdpcum = cumprod(1 + rgdp/100)^(1/4)) %>%  # cumulative gross GDP growth
    ungroup

bak <- bc

### output gap forecasts
## (1) real-time real GDP from ALFRED
gdp <- read_tsv("data/alfred/GDPC1.txt", na = ".") %>%
    rename(date = observation_date) %>%
    pivot_longer(-date, names_to = "vin", values_to = "gdp", names_prefix = "GDPC1_") %>%
    mutate(vin = as.Date(vin, format="%Y%m%d")) %>%
    na.omit
## determine units for each vintage
gdpunits <- read.fwf("data/alfred/GDPC1_UNITS.txt", width = c(72, 13, 13), col.names = c("units", "start", "end")) %>% as_tibble %>%
    mutate(across(everything(), stringr::str_trim),  # trim whitespaces
           units = readr::parse_number(units),   # simplify units description (only number)
           across(c(start, end), as.Date))
gdpvintages <- gdp %>% select(vin) %>% distinct %>% arrange(vin) %>%
    mutate(units = gdpunits$units[findInterval(vin, gdpunits$start)]) %>%
    rename(gdpvin = vin, gdpunits = units)

## (2) real-time potential GDP projections from ALFRED
pot <- read_tsv("data/alfred/GDPPOT.txt", na=".") %>%
    rename(date = observation_date) %>%
    pivot_longer(-date, names_to = "vin", values_to = "pot", names_prefix = "GDPPOT_") %>%
    mutate(vin = as.Date(vin, format="%Y%m%d")) %>%
    na.omit %>%
    arrange(vin)
## determine units for each vintage
potunits <- read.fwf("data/alfred/GDPPOT_UNITS.txt", width = c(72, 13, 13), col.names = c("units", "start", "end")) %>% as_tibble %>%
    mutate(across(everything(), stringr::str_trim),
           units = readr::parse_number(units),   # simplify units description (only number)
           across(c(start, end), as.Date))
potvintages <- pot %>% select(vin) %>% distinct %>%
    mutate(units = potunits$units[findInterval(vin, potunits$start)])

## for each survey, find last quarter GDP value
##  use latest vintage before survey date, or otherwise earliest vintage after survey date
surveys <- bc %>% select(date) %>% distinct %>%
    mutate(gdpdate = get_lqdate(date))
## if last GDP numbers not yet released, use most recent
surveys$gdpdate <- pmin(surveys$gdpdate, max(gdp$date))
surveys$gdpvin <- as.Date(NA)
for (t in 1:nrow(surveys)) {
    vins <- gdp %>% filter(date == surveys$gdpdate[t]) %>% pull(vin)
    if (min(vins) > surveys$date[t]) {
        surveys$gdpvin[t] <- as.Date(min(vins))
    } else {
        surveys$gdpvin[t] <- max(vins[vins <= surveys$date[t]])
    }
}
surveys <- surveys %>%
    left_join(gdp %>% rename(gdpvin = vin, gdpdate = date)) %>%  # add GDP number
    left_join(gdpvintages) %>%  # and units
    ## find most recent vintage of potential GDP projections
    mutate(potvin = potvintages$vin[pmax(findInterval(date, potvintages$vin), 1)]) %>%
    left_join(potvintages %>% rename(potvin = vin, potunits = units))
## deal with different units: when most recent potential GDP does not have same units as real GDP, use first potential GDP vintage with same units (pretend it was available in real time)
## surveys %>% filter(gdpunits != potunits) %>% print(n=Inf) # look at surveys where units aren't matching
for (j in 1:nrow(surveys)) {
    if (surveys$gdpunits[j] != surveys$potunits[j]) {
        potvinnew <- potvintages %>% filter(units == surveys$gdpunits[j]) %>% pull(vin) %>% min
        surveys$potvin[j] <- potvinnew
        surveys$potunits[j] <- surveys$gdpunits[j]
    }
}

## add potential GDP projections
surveys <- surveys %>%
    expand_grid(h = -1:5) %>%
    mutate(fdate = get_fdate(date, h)) %>%  # calculate relevant future quarter
    left_join(pot %>% rename(potvin = vin, fdate = date))

## real-time estimate of output gap
gap <- surveys %>%
    filter(gdpdate == fdate) %>%
    mutate(gap = 100*(gdp - pot)/pot) %>%
    select(fdate, gdp, pot, gap) %>%
    distinct %>%
    rename(date = fdate)

## merge current GDP and pot. GDP forecasts into surveys, construct output gap forecasts
bc <- bc %>%
    inner_join(surveys %>% filter(h >= 0) %>% select(date, h, gdp, pot)) %>%
    mutate(gdp = gdp*rgdpcum,  # gdp level forecast
           gap = 100*(gdp - pot)/pot) %>%
    arrange(date, id, h)     # nice sorting

save(bc, file="data/bc_May23.RData")
haven::write_dta(bc, path="data/bc_May23.dta")
